{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Import necessary dependencies and settings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import re\n",
    "import nltk"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Sample corpus of text documents"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Document</th>\n",
       "      <th>Category</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>The sky is blue and beautiful.</td>\n",
       "      <td>weather</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Love this blue and beautiful sky!</td>\n",
       "      <td>weather</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>The quick brown fox jumps over the lazy dog.</td>\n",
       "      <td>animals</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>The brown fox is quick and the blue dog is lazy!</td>\n",
       "      <td>animals</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>The sky is very blue and the sky is very beaut...</td>\n",
       "      <td>weather</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>The dog is lazy but the brown fox is quick!</td>\n",
       "      <td>animals</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                            Document Category\n",
       "0                     The sky is blue and beautiful.  weather\n",
       "1                  Love this blue and beautiful sky!  weather\n",
       "2       The quick brown fox jumps over the lazy dog.  animals\n",
       "3   The brown fox is quick and the blue dog is lazy!  animals\n",
       "4  The sky is very blue and the sky is very beaut...  weather\n",
       "5        The dog is lazy but the brown fox is quick!  animals"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "corpus = ['The sky is blue and beautiful.',\n",
    "          'Love this blue and beautiful sky!',\n",
    "          'The quick brown fox jumps over the lazy dog.',\n",
    "          'The brown fox is quick and the blue dog is lazy!',\n",
    "          'The sky is very blue and the sky is very beautiful today',\n",
    "          'The dog is lazy but the brown fox is quick!'    \n",
    "]\n",
    "labels = ['weather', 'weather', 'animals', 'animals', 'weather', 'animals']\n",
    "corpus = np.array(corpus)\n",
    "corpus_df = pd.DataFrame({'Document': corpus, \n",
    "                          'Category': labels})\n",
    "corpus_df = corpus_df[['Document', 'Category']]\n",
    "corpus_df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Simple text pre-processing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "wpt = nltk.WordPunctTokenizer()\n",
    "stop_words = nltk.corpus.stopwords.words('english')\n",
    "\n",
    "def normalize_document(doc):\n",
    "    # lower case and remove special characters\\whitespaces\n",
    "    doc = re.sub(r'[^a-zA-Z0-9\\s]', '', doc, re.I)\n",
    "    doc = doc.lower()\n",
    "    doc = doc.strip()\n",
    "    # tokenize document\n",
    "    tokens = wpt.tokenize(doc)\n",
    "    # filter stopwords out of document\n",
    "    filtered_tokens = [token for token in tokens if token not in stop_words]\n",
    "    # re-create document from filtered tokens\n",
    "    doc = ' '.join(filtered_tokens)\n",
    "    return doc\n",
    "\n",
    "normalize_corpus = np.vectorize(normalize_document)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['sky blue beautiful', 'love blue beautiful sky',\n",
       "       'quick brown fox jumps lazy dog', 'brown fox quick blue dog lazy',\n",
       "       'sky blue sky beautiful today', 'dog lazy brown fox quick'],\n",
       "      dtype='<U30')"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "norm_corpus = normalize_corpus(corpus)\n",
    "norm_corpus"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Bag of Words Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0],\n",
       "       [1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0],\n",
       "       [0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0],\n",
       "       [0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0],\n",
       "       [1, 1, 0, 0, 0, 0, 0, 0, 0, 2, 1],\n",
       "       [0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0]], dtype=int64)"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "\n",
    "cv = CountVectorizer(min_df=0., max_df=1.)\n",
    "cv_matrix = cv.fit_transform(norm_corpus)\n",
    "cv_matrix = cv_matrix.toarray()\n",
    "cv_matrix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>beautiful</th>\n",
       "      <th>blue</th>\n",
       "      <th>brown</th>\n",
       "      <th>dog</th>\n",
       "      <th>fox</th>\n",
       "      <th>jumps</th>\n",
       "      <th>lazy</th>\n",
       "      <th>love</th>\n",
       "      <th>quick</th>\n",
       "      <th>sky</th>\n",
       "      <th>today</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   beautiful  blue  brown  dog  fox  jumps  lazy  love  quick  sky  today\n",
       "0          1     1      0    0    0      0     0     0      0    1      0\n",
       "1          1     1      0    0    0      0     0     1      0    1      0\n",
       "2          0     0      1    1    1      1     1     0      1    0      0\n",
       "3          0     1      1    1    1      0     1     0      1    0      0\n",
       "4          1     1      0    0    0      0     0     0      0    2      1\n",
       "5          0     0      1    1    1      0     1     0      1    0      0"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "vocab = cv.get_feature_names()\n",
    "pd.DataFrame(cv_matrix, columns=vocab)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Bag of N-Grams Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>beautiful sky</th>\n",
       "      <th>beautiful today</th>\n",
       "      <th>blue beautiful</th>\n",
       "      <th>blue dog</th>\n",
       "      <th>blue sky</th>\n",
       "      <th>brown fox</th>\n",
       "      <th>dog lazy</th>\n",
       "      <th>fox jumps</th>\n",
       "      <th>fox quick</th>\n",
       "      <th>jumps lazy</th>\n",
       "      <th>lazy brown</th>\n",
       "      <th>lazy dog</th>\n",
       "      <th>love blue</th>\n",
       "      <th>quick blue</th>\n",
       "      <th>quick brown</th>\n",
       "      <th>sky beautiful</th>\n",
       "      <th>sky blue</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   beautiful sky  beautiful today  blue beautiful  blue dog  blue sky  \\\n",
       "0              0                0               1         0         0   \n",
       "1              1                0               1         0         0   \n",
       "2              0                0               0         0         0   \n",
       "3              0                0               0         1         0   \n",
       "4              0                1               0         0         1   \n",
       "5              0                0               0         0         0   \n",
       "\n",
       "   brown fox  dog lazy  fox jumps  fox quick  jumps lazy  lazy brown  \\\n",
       "0          0         0          0          0           0           0   \n",
       "1          0         0          0          0           0           0   \n",
       "2          1         0          1          0           1           0   \n",
       "3          1         1          0          1           0           0   \n",
       "4          0         0          0          0           0           0   \n",
       "5          1         1          0          1           0           1   \n",
       "\n",
       "   lazy dog  love blue  quick blue  quick brown  sky beautiful  sky blue  \n",
       "0         0          0           0            0              0         1  \n",
       "1         0          1           0            0              0         0  \n",
       "2         1          0           0            1              0         0  \n",
       "3         0          0           1            0              0         0  \n",
       "4         0          0           0            0              1         1  \n",
       "5         0          0           0            0              0         0  "
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "bv = CountVectorizer(ngram_range=(2,2))\n",
    "bv_matrix = bv.fit_transform(norm_corpus)\n",
    "bv_matrix = bv_matrix.toarray()\n",
    "vocab = bv.get_feature_names()\n",
    "pd.DataFrame(bv_matrix, columns=vocab)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# TF-IDF Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>beautiful</th>\n",
       "      <th>blue</th>\n",
       "      <th>brown</th>\n",
       "      <th>dog</th>\n",
       "      <th>fox</th>\n",
       "      <th>jumps</th>\n",
       "      <th>lazy</th>\n",
       "      <th>love</th>\n",
       "      <th>quick</th>\n",
       "      <th>sky</th>\n",
       "      <th>today</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.60</td>\n",
       "      <td>0.52</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.60</td>\n",
       "      <td>0.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.46</td>\n",
       "      <td>0.39</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.66</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.46</td>\n",
       "      <td>0.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.38</td>\n",
       "      <td>0.38</td>\n",
       "      <td>0.38</td>\n",
       "      <td>0.54</td>\n",
       "      <td>0.38</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.38</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.00</td>\n",
       "      <td>0.36</td>\n",
       "      <td>0.42</td>\n",
       "      <td>0.42</td>\n",
       "      <td>0.42</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.42</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.42</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.36</td>\n",
       "      <td>0.31</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.72</td>\n",
       "      <td>0.52</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.45</td>\n",
       "      <td>0.45</td>\n",
       "      <td>0.45</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.45</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.45</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   beautiful  blue  brown   dog   fox  jumps  lazy  love  quick   sky  today\n",
       "0       0.60  0.52   0.00  0.00  0.00   0.00  0.00  0.00   0.00  0.60   0.00\n",
       "1       0.46  0.39   0.00  0.00  0.00   0.00  0.00  0.66   0.00  0.46   0.00\n",
       "2       0.00  0.00   0.38  0.38  0.38   0.54  0.38  0.00   0.38  0.00   0.00\n",
       "3       0.00  0.36   0.42  0.42  0.42   0.00  0.42  0.00   0.42  0.00   0.00\n",
       "4       0.36  0.31   0.00  0.00  0.00   0.00  0.00  0.00   0.00  0.72   0.52\n",
       "5       0.00  0.00   0.45  0.45  0.45   0.00  0.45  0.00   0.45  0.00   0.00"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "\n",
    "tv = TfidfVectorizer(min_df=0., max_df=1., use_idf=True)\n",
    "tv_matrix = tv.fit_transform(norm_corpus)\n",
    "tv_matrix = tv_matrix.toarray()\n",
    "\n",
    "vocab = tv.get_feature_names()\n",
    "pd.DataFrame(np.round(tv_matrix, 2), columns=vocab)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Document Similarity"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "      <th>5</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.753128</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.185447</td>\n",
       "      <td>0.807539</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.753128</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.139665</td>\n",
       "      <td>0.608181</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.784362</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.839987</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.185447</td>\n",
       "      <td>0.139665</td>\n",
       "      <td>0.784362</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.109653</td>\n",
       "      <td>0.933779</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.807539</td>\n",
       "      <td>0.608181</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.109653</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.839987</td>\n",
       "      <td>0.933779</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          0         1         2         3         4         5\n",
       "0  1.000000  0.753128  0.000000  0.185447  0.807539  0.000000\n",
       "1  0.753128  1.000000  0.000000  0.139665  0.608181  0.000000\n",
       "2  0.000000  0.000000  1.000000  0.784362  0.000000  0.839987\n",
       "3  0.185447  0.139665  0.784362  1.000000  0.109653  0.933779\n",
       "4  0.807539  0.608181  0.000000  0.109653  1.000000  0.000000\n",
       "5  0.000000  0.000000  0.839987  0.933779  0.000000  1.000000"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.metrics.pairwise import cosine_similarity\n",
    "\n",
    "similarity_matrix = cosine_similarity(tv_matrix)\n",
    "similarity_df = pd.DataFrame(similarity_matrix)\n",
    "similarity_df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Clustering documents using similarity features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Document</th>\n",
       "      <th>Category</th>\n",
       "      <th>ClusterLabel</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>The sky is blue and beautiful.</td>\n",
       "      <td>weather</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Love this blue and beautiful sky!</td>\n",
       "      <td>weather</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>The quick brown fox jumps over the lazy dog.</td>\n",
       "      <td>animals</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>The brown fox is quick and the blue dog is lazy!</td>\n",
       "      <td>animals</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>The sky is very blue and the sky is very beaut...</td>\n",
       "      <td>weather</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>The dog is lazy but the brown fox is quick!</td>\n",
       "      <td>animals</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                            Document Category  ClusterLabel\n",
       "0                     The sky is blue and beautiful.  weather             1\n",
       "1                  Love this blue and beautiful sky!  weather             1\n",
       "2       The quick brown fox jumps over the lazy dog.  animals             0\n",
       "3   The brown fox is quick and the blue dog is lazy!  animals             0\n",
       "4  The sky is very blue and the sky is very beaut...  weather             1\n",
       "5        The dog is lazy but the brown fox is quick!  animals             0"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.cluster import KMeans\n",
    "\n",
    "km = KMeans(n_clusters=2)\n",
    "km.fit_transform(similarity_df)\n",
    "cluster_labels = km.labels_\n",
    "cluster_labels = pd.DataFrame(cluster_labels, columns=['ClusterLabel'])\n",
    "pd.concat([corpus_df, cluster_labels], axis=1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Topic models"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Program Files\\Anaconda3\\lib\\site-packages\\sklearn\\decomposition\\online_lda.py:508: DeprecationWarning: The default value for 'learning_method' will be changed from 'online' to 'batch' in the release 0.20. This warning was introduced in 0.18.\n",
      "  DeprecationWarning)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>T1</th>\n",
       "      <th>T2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.190615</td>\n",
       "      <td>0.809385</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.176860</td>\n",
       "      <td>0.823140</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.846148</td>\n",
       "      <td>0.153852</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.815229</td>\n",
       "      <td>0.184771</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.180563</td>\n",
       "      <td>0.819437</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>0.839140</td>\n",
       "      <td>0.160860</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         T1        T2\n",
       "0  0.190615  0.809385\n",
       "1  0.176860  0.823140\n",
       "2  0.846148  0.153852\n",
       "3  0.815229  0.184771\n",
       "4  0.180563  0.819437\n",
       "5  0.839140  0.160860"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.decomposition import LatentDirichletAllocation\n",
    "\n",
    "lda = LatentDirichletAllocation(n_topics=2, max_iter=100, random_state=42)\n",
    "dt_matrix = lda.fit_transform(tv_matrix)\n",
    "features = pd.DataFrame(dt_matrix, columns=['T1', 'T2'])\n",
    "features"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Show topics and their weights"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[('fox', 1.7265536238698524), ('quick', 1.7264910761871224), ('dog', 1.7264019823624879), ('brown', 1.7263774760262807), ('lazy', 1.7263567668213813), ('jumps', 1.0326450363521607), ('blue', 0.7770158513472083)]\n",
      "\n",
      "[('sky', 2.263185143458752), ('beautiful', 1.9057084998062579), ('blue', 1.7954559705805626), ('love', 1.1476805311187976), ('today', 1.0064979209198706)]\n",
      "\n"
     ]
    }
   ],
   "source": [
    "tt_matrix = lda.components_\n",
    "for topic_weights in tt_matrix:\n",
    "    topic = [(token, weight) for token, weight in zip(vocab, topic_weights)]\n",
    "    topic = sorted(topic, key=lambda x: -x[1])\n",
    "    topic = [item for item in topic if item[1] > 0.6]\n",
    "    print(topic)\n",
    "    print()\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Clustering documents using topic model features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Document</th>\n",
       "      <th>Category</th>\n",
       "      <th>ClusterLabel</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>The sky is blue and beautiful.</td>\n",
       "      <td>weather</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Love this blue and beautiful sky!</td>\n",
       "      <td>weather</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>The quick brown fox jumps over the lazy dog.</td>\n",
       "      <td>animals</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>The brown fox is quick and the blue dog is lazy!</td>\n",
       "      <td>animals</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>The sky is very blue and the sky is very beaut...</td>\n",
       "      <td>weather</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>The dog is lazy but the brown fox is quick!</td>\n",
       "      <td>animals</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                            Document Category  ClusterLabel\n",
       "0                     The sky is blue and beautiful.  weather             0\n",
       "1                  Love this blue and beautiful sky!  weather             0\n",
       "2       The quick brown fox jumps over the lazy dog.  animals             1\n",
       "3   The brown fox is quick and the blue dog is lazy!  animals             1\n",
       "4  The sky is very blue and the sky is very beaut...  weather             0\n",
       "5        The dog is lazy but the brown fox is quick!  animals             1"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "km = KMeans(n_clusters=2)\n",
    "km.fit_transform(features)\n",
    "cluster_labels = km.labels_\n",
    "cluster_labels = pd.DataFrame(cluster_labels, columns=['ClusterLabel'])\n",
    "pd.concat([corpus_df, cluster_labels], axis=1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Word Embeddings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Program Files\\Anaconda3\\lib\\site-packages\\gensim\\utils.py:860: UserWarning: detected Windows; aliasing chunkize to chunkize_serial\n",
      "  warnings.warn(\"detected Windows; aliasing chunkize to chunkize_serial\")\n",
      "Using TensorFlow backend.\n"
     ]
    }
   ],
   "source": [
    "from gensim.models import word2vec\n",
    "\n",
    "wpt = nltk.WordPunctTokenizer()\n",
    "tokenized_corpus = [wpt.tokenize(document) for document in norm_corpus]\n",
    "\n",
    "# Set values for various parameters\n",
    "feature_size = 10    # Word vector dimensionality  \n",
    "window_context = 10          # Context window size                                                                                    \n",
    "min_word_count = 1   # Minimum word count                        \n",
    "sample = 1e-3   # Downsample setting for frequent words\n",
    "\n",
    "w2v_model = word2vec.Word2Vec(tokenized_corpus, size=feature_size, \n",
    "                          window=window_context, min_count = min_word_count,\n",
    "                          sample=sample)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([ 0.01000661, -0.02096199,  0.03674392, -0.03230235,  0.02149896,\n",
       "       -0.01101904,  0.01973963,  0.00615971, -0.01040678, -0.00668737], dtype=float32)"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "w2v_model.wv['sky']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def average_word_vectors(words, model, vocabulary, num_features):\n",
    "    \n",
    "    feature_vector = np.zeros((num_features,),dtype=\"float64\")\n",
    "    nwords = 0.\n",
    "    \n",
    "    for word in words:\n",
    "        if word in vocabulary: \n",
    "            nwords = nwords + 1.\n",
    "            feature_vector = np.add(feature_vector, model[word])\n",
    "    \n",
    "    if nwords:\n",
    "        feature_vector = np.divide(feature_vector, nwords)\n",
    "        \n",
    "    return feature_vector\n",
    "    \n",
    "   \n",
    "def averaged_word_vectorizer(corpus, model, num_features):\n",
    "    vocabulary = set(model.wv.index2word)\n",
    "    features = [average_word_vectors(tokenized_sentence, model, vocabulary, num_features)\n",
    "                    for tokenized_sentence in corpus]\n",
    "    return np.array(features)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "      <th>5</th>\n",
       "      <th>6</th>\n",
       "      <th>7</th>\n",
       "      <th>8</th>\n",
       "      <th>9</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>-0.010540</td>\n",
       "      <td>-0.015367</td>\n",
       "      <td>0.005373</td>\n",
       "      <td>-0.020741</td>\n",
       "      <td>0.030717</td>\n",
       "      <td>-0.022407</td>\n",
       "      <td>-0.001724</td>\n",
       "      <td>0.004722</td>\n",
       "      <td>0.026881</td>\n",
       "      <td>0.011909</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>-0.017797</td>\n",
       "      <td>-0.013693</td>\n",
       "      <td>-0.003599</td>\n",
       "      <td>-0.015436</td>\n",
       "      <td>0.022831</td>\n",
       "      <td>-0.017905</td>\n",
       "      <td>0.010470</td>\n",
       "      <td>0.001540</td>\n",
       "      <td>0.025658</td>\n",
       "      <td>0.016208</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>-0.020869</td>\n",
       "      <td>-0.018273</td>\n",
       "      <td>-0.019681</td>\n",
       "      <td>-0.004124</td>\n",
       "      <td>-0.010980</td>\n",
       "      <td>0.001654</td>\n",
       "      <td>-0.001310</td>\n",
       "      <td>0.003395</td>\n",
       "      <td>0.003760</td>\n",
       "      <td>0.010851</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>-0.017561</td>\n",
       "      <td>-0.017866</td>\n",
       "      <td>-0.016438</td>\n",
       "      <td>-0.007601</td>\n",
       "      <td>-0.005687</td>\n",
       "      <td>-0.008843</td>\n",
       "      <td>-0.002385</td>\n",
       "      <td>0.001444</td>\n",
       "      <td>0.005643</td>\n",
       "      <td>0.012638</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.002371</td>\n",
       "      <td>-0.006731</td>\n",
       "      <td>0.017480</td>\n",
       "      <td>-0.014220</td>\n",
       "      <td>0.022088</td>\n",
       "      <td>-0.014882</td>\n",
       "      <td>0.003067</td>\n",
       "      <td>0.002605</td>\n",
       "      <td>0.021167</td>\n",
       "      <td>0.006461</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>-0.018306</td>\n",
       "      <td>-0.012056</td>\n",
       "      <td>-0.015671</td>\n",
       "      <td>-0.011617</td>\n",
       "      <td>-0.011667</td>\n",
       "      <td>-0.005490</td>\n",
       "      <td>0.005404</td>\n",
       "      <td>-0.003512</td>\n",
       "      <td>-0.003198</td>\n",
       "      <td>0.013306</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          0         1         2         3         4         5         6  \\\n",
       "0 -0.010540 -0.015367  0.005373 -0.020741  0.030717 -0.022407 -0.001724   \n",
       "1 -0.017797 -0.013693 -0.003599 -0.015436  0.022831 -0.017905  0.010470   \n",
       "2 -0.020869 -0.018273 -0.019681 -0.004124 -0.010980  0.001654 -0.001310   \n",
       "3 -0.017561 -0.017866 -0.016438 -0.007601 -0.005687 -0.008843 -0.002385   \n",
       "4  0.002371 -0.006731  0.017480 -0.014220  0.022088 -0.014882  0.003067   \n",
       "5 -0.018306 -0.012056 -0.015671 -0.011617 -0.011667 -0.005490  0.005404   \n",
       "\n",
       "          7         8         9  \n",
       "0  0.004722  0.026881  0.011909  \n",
       "1  0.001540  0.025658  0.016208  \n",
       "2  0.003395  0.003760  0.010851  \n",
       "3  0.001444  0.005643  0.012638  \n",
       "4  0.002605  0.021167  0.006461  \n",
       "5 -0.003512 -0.003198  0.013306  "
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "w2v_feature_array = averaged_word_vectorizer(corpus=tokenized_corpus, model=w2v_model,\n",
    "                                             num_features=feature_size)\n",
    "pd.DataFrame(w2v_feature_array)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Document</th>\n",
       "      <th>Category</th>\n",
       "      <th>ClusterLabel</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>The sky is blue and beautiful.</td>\n",
       "      <td>weather</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Love this blue and beautiful sky!</td>\n",
       "      <td>weather</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>The quick brown fox jumps over the lazy dog.</td>\n",
       "      <td>animals</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>The brown fox is quick and the blue dog is lazy!</td>\n",
       "      <td>animals</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>The sky is very blue and the sky is very beaut...</td>\n",
       "      <td>weather</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>The dog is lazy but the brown fox is quick!</td>\n",
       "      <td>animals</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                            Document Category  ClusterLabel\n",
       "0                     The sky is blue and beautiful.  weather             0\n",
       "1                  Love this blue and beautiful sky!  weather             0\n",
       "2       The quick brown fox jumps over the lazy dog.  animals             1\n",
       "3   The brown fox is quick and the blue dog is lazy!  animals             1\n",
       "4  The sky is very blue and the sky is very beaut...  weather             0\n",
       "5        The dog is lazy but the brown fox is quick!  animals             1"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.cluster import AffinityPropagation\n",
    "\n",
    "ap = AffinityPropagation()\n",
    "ap.fit(w2v_feature_array)\n",
    "cluster_labels = ap.labels_\n",
    "cluster_labels = pd.DataFrame(cluster_labels, columns=['ClusterLabel'])\n",
    "pd.concat([corpus_df, cluster_labels], axis=1)"
   ]
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [conda root]",
   "language": "python",
   "name": "conda-root-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
